clear all
set more off

* Assume working directory is already set to Project/

*****************************************
* STEP 1: IMPORT & CLEAN SAINC1 DATA *
*****************************************

* Go to Data/Input Data/
cd "Data/InputData/"

* Import Excel file
import excel "SAINC1.xlsx", sheet("Table") firstrow clear

* Drop any rows that contain variable labels accidentally imported
drop if B == "GeoName"

* Rename variables based on dataset structure
rename B state
rename C linecode
rename D description
rename (E F G H I J K L M) (year_2010 year_2011 year_2012 year_2013 year_2014 year_2015 year_2016 year_2017 year_2018)

* Convert year columns to numeric
destring year_2010-year_2018, replace force

* Standardize state names (remove asterisks and trim spaces)
replace state = trim(state)
replace state = subinstr(state, "*", "", .)

* Save cleaned dataset (move up two folders first)
cd "../.."
save "Data/IntermediateData/SAINC1_clean.dta", replace

*****************************************
* STEP 2: IMPORT & CLEAN SAINC51 DATA *
*****************************************

* Go back to Data/Intermediate Data/
cd "Data/InputData/"

* Import Excel file
import excel "SAINC51.xlsx", sheet("Table") firstrow clear

* Drop any rows that contain variable labels accidentally imported
drop if B == "GeoName"

* Rename variables
rename B state
rename C linecode
rename D description
rename (E F G H I J K L M) (year_2010 year_2011 year_2012 year_2013 year_2014 year_2015 year_2016 year_2017 year_2018)

* Convert year columns to numeric
destring year_2010-year_2018, replace force

* Standardize state names
replace state = trim(state)
replace state = subinstr(state, "*", "", .)

* Save cleaned dataset
cd "../.."
save "Data/IntermediateData/SAINC51_clean.dta", replace


*******************************************
* STEP 3: RESHAPE SAINC1 TO LONG FORMAT  *
*******************************************

* Load cleaned SAINC1 data
use "Data/IntermediateData/SAINC1_clean.dta", clear

* Identify and remove duplicate rows
duplicates report state linecode description
duplicates drop state linecode description, force

* Reshape data to long format
reshape long year_, i(state linecode description) j(year)

* Rename reshaped column
rename year_ income

* Convert year to numeric
destring year, replace

* Save long format
save "Data/IntermediateData/SAINC1_long.dta", replace

*******************************************
* STEP 4: RESHAPE SAINC51 TO LONG FORMAT *
*******************************************

* Load cleaned SAINC51 data
use "Data/IntermediateData/SAINC51_clean.dta", clear

* Identify and remove duplicate rows
duplicates report state linecode description
duplicates drop state linecode description, force

* Reshape data to long format
reshape long year_, i(state linecode description) j(year)

* Rename reshaped column
rename year_ disposable_income

* Convert year to numeric
destring year, replace

* Save long format
save "Data/IntermediateData/SAINC51_long.dta", replace

*******************************************
* FINAL CHECKS *
*******************************************

* Check SAINC1 dataset
use "Data/IntermediateData/SAINC1_long.dta", clear
summarize year income
tab state

* Check SAINC51 dataset
use "Data/IntermediateData/SAINC51_long.dta", clear
summarize year disposable_income
tab state
